{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re, sys, os, csv\n",
    "from many_stop_words import get_stop_words\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#stop_words = list(get_stop_words('en'))         #About 900 stop words\n",
    "#nltk_words = list(stopwords.words('english'))   #About 150 stop words\n",
    "#stop_words.extend(nltk_words)\n",
    "\n",
    "def word_prob(word): return dictionary[word] / total\n",
    "def words(text): return re.findall('[a-z]+', text.lower())\n",
    "dictionary = Counter(words(open('dataset/wordlists/merged.txt').read()))\n",
    "max_word_length = max(map(len, dictionary))\n",
    "total = float(sum(dictionary.values()))\n",
    "\n",
    "def viterbi_segment(text):\n",
    "    probs, lasts = [1.0], [0]\n",
    "    for i in range(1, len(text) + 1):\n",
    "        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)\n",
    "                        for j in range(max(0, i - max_word_length), i))\n",
    "        probs.append(prob_k)\n",
    "        lasts.append(k)\n",
    "    words = []\n",
    "    i = len(text)\n",
    "    while 0 < i:\n",
    "        words.append(text[lasts[i]:i])\n",
    "        i = lasts[i]\n",
    "    words.reverse()\n",
    "    return words, probs[-1]\n",
    "\n",
    "def fix_hashtag(text):\n",
    "    text = text.group().split(\":\")[0]\n",
    "    text = text[1:] # remove '#'\n",
    "    try:\n",
    "        test = int(text[0])\n",
    "        text = text[1:]\n",
    "    except:\n",
    "        pass\n",
    "    output = ' '.join(viterbi_segment(text)[0])\n",
    "    #print(output)\n",
    "    return output\n",
    "\n",
    "def clean_tweet( tweet):\n",
    "        tweet = tweet.lower()\n",
    "        tweet = re.sub(\"(#[A-Za-z0-9]+)\", fix_hashtag, tweet)\n",
    "        return ' '.join(re.sub(\"(@[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\", \" \", tweet).split())\n",
    "\n",
    "def remove_stopwords(word_list):\n",
    "        filtered_tweet=\"\"\n",
    "        for word in word_list:\n",
    "            word = word.lower() \n",
    "            if word not in stopwords.words(\"english\"):\n",
    "                filtered_tweet=filtered_tweet + \" \" + word\n",
    "        \n",
    "        return filtered_tweet.lstrip()\n",
    "    \n",
    "def vectorise_label(label):\n",
    "    if label == \"empty\":return 0\n",
    "    elif label == \"sadness\":return 3\n",
    "    elif label == \"enthusiasm\":return 1\n",
    "    elif label == \"neutral\":return 0\n",
    "    elif label == \"worry\":return 3\n",
    "    elif label == \"surprise\":return 2\n",
    "    elif label == \"love\":return 2\n",
    "    elif label == \"fun\":return 1\n",
    "    elif label == \"hate\":return 4\n",
    "    elif label == \"happiness\":return 1\n",
    "    elif label == \"boredom\":return 0\n",
    "    elif label == \"relief\":return 1\n",
    "    elif label == \"anger\":return 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_csv(file, lst):\n",
    "    with open(file, newline='') as f:\n",
    "        reader = csv.reader(f)\n",
    "        for i,row in enumerate(reader):\n",
    "            if i > 0:\n",
    "                tweet = clean_tweet(str(row[1]))\n",
    "                lst.append(tweet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "hate = []\n",
    "read_csv(\"hate.csv\", hate)\n",
    "\"\"\"worry = []\n",
    "read_csv(\"raw_data/worried.csv\", worry)\n",
    "#read_csv(\"worry.csv\", worry)\n",
    "read_csv(\"raw_data/anxiety.csv\", worry)\n",
    "happy = []\n",
    "read_csv(\"raw_data/happy.csv\", happy)\n",
    "sad = []\n",
    "read_csv(\"raw_data/sad.csv\", sad)\"\"\"\n",
    "\n",
    "dataWriter = csv.writer(open('hate_processes.csv', 'w'), delimiter=',',lineterminator=\"\\n\")\n",
    "for tweet in hate:\n",
    "    dataWriter.writerow([tweet, 8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5309\n"
     ]
    }
   ],
   "source": [
    "count = 0\n",
    "with open('data_new.csv') as csvfile:\n",
    "    readCSV = csv.reader(csvfile, delimiter=',')\n",
    "    for row in readCSV:\n",
    "        count+=1\n",
    "print(count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
